package org.spider.utils;

import org.apache.commons.lang3.StringUtils;
import java.io.UnsupportedEncodingException;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLDecoder;
import java.net.URLEncoder;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * URL and HTML helper methods.
 *
 * @author code4crafter@gmail.com <br>
 * @since 0.1.0
 */
public class UrlUtils {

    /** Matches a scheme prefix such as {@code http://}. */
    private static final Pattern PROTOCOL_PATTERN = Pattern.compile("[\\w]+://");

    /** Extracts the charset token from a Content-Type style header value. */
    private static final Pattern CHARSET_PATTERN =
            Pattern.compile("charset\\s*=\\s*['\"]*([^\\s;'\"]*)", Pattern.CASE_INSENSITIVE);

    /** Matches one or more consecutive {@code #} characters. */
    private static final Pattern REPEATED_HASH = Pattern.compile("#+");

    /**
     * Literal replacements applied by {@link #urlEncoder(String, String)}, in order.
     * The percent sign has to stay first: every later entry inserts a percent sign,
     * and escaping it afterwards would double-encode those sequences.
     */
    private static final String[][] LITERAL_ESCAPES = {
        {"%", "%25"},
        {" ", "%20"},
        {"\u00A0", "%C2%A0"}, // non-breaking space, UTF-8 encoded
        {"[", "%5B"},
        {"]", "%5D"},
        {"{", "%7B"},
        {"}", "%7D"},
        {"|", "%7C"},
        {"^", "%5E"},
        {"\"", "%22"},
        {"#", "%23"},
        {"=", "%3D"},
        {"&", "%26"},
        {"+", "%2B"},
        {"\u0092", "%C2%92"}, // C1 control character U+0092, UTF-8 encoded
        {"\u0099", "%C2%99"}, // C1 control character U+0099, UTF-8 encoded
    };

    /**
     * Resolves {@code url} against {@code refer} and returns the absolute form.
     * <br>
     * Borrowed from Jsoup.
     *
     * @param url   the (possibly relative) url to resolve; must not be null when
     *              {@code refer} is a valid url
     * @param refer the url of the page that {@code url} was found on
     * @return the absolute url, or an empty string when no valid url can be built
     */
    public static String canonicalizeUrl(String url, String refer) {
        try {
            URL base;
            try {
                base = new URL(refer);
            } catch (MalformedURLException e) {
                // the base is unsuitable, but url may be absolute on its own, so try that
                return new URL(url).toExternalForm();
            }
            // workaround: java resolves '//path/file + ?foo' to '//path/?foo', not '//path/file?foo' as desired
            if (url.startsWith("?")) {
                url = base.getPath() + url;
            }
            return new URL(base, url).toExternalForm();
        } catch (MalformedURLException e) {
            return "";
        }
    }

    /**
     * Replaces every space in {@code url} with {@code %20}.
     *
     * @param url url
     * @return new url
     * @deprecated use {@link #fixIllegalCharacterInUrl(String)} instead
     */
    @Deprecated
    public static String encodeIllegalCharacterInUrl(String url) {
        return url.replace(" ", "%20");
    }

    /**
     * Replaces every space with {@code %20} and collapses each run of {@code #}
     * characters into a single {@code #}.
     *
     * @param url url, must not be null
     * @return the repaired url
     */
    public static String fixIllegalCharacterInUrl(String url) {
        //TODO more character support
        return REPEATED_HASH.matcher(url.replace(" ", "%20")).replaceAll("#");
    }

    /**
     * Percent-encodes a fixed set of reserved/unsafe characters and every character
     * that {@code isChinese} accepts, leaving all other characters untouched.
     * <p>
     * An existing percent sign is escaped to {@code %25} as well, so input that is
     * already percent-encoded will be encoded a second time.
     *
     * @param url     the text to encode, must not be null
     * @param charset charset name used to encode the characters accepted by {@code isChinese}
     * @return the encoded text
     * @throws RuntimeException if {@code charset} is not supported and the input contains
     *                          a character that has to be encoded with it
     */
    public static String urlEncoder(String url, String charset) {
        String escaped = url;
        for (String[] pair : LITERAL_ESCAPES) {
            escaped = escaped.replace(pair[0], pair[1]);
        }
        StringBuilder sb = new StringBuilder(escaped.length());
        for (int i = 0; i < escaped.length(); i++) {
            char c = escaped.charAt(i);
            if (isChinese(c)) {
                try {
                    sb.append(URLEncoder.encode(String.valueOf(c), charset));
                } catch (UnsupportedEncodingException e) {
                    throw new RuntimeException("chinese append failed !!!", e);
                }
            } else {
                sb.append(c);
            }
        }
        return sb.toString();
    }

    /**
     * Tells whether {@code c} belongs to one of the Unicode blocks that
     * {@link #urlEncoder(String, String)} encodes with the caller-supplied charset:
     * the CJK ideograph blocks listed below, general punctuation, CJK punctuation
     * and the half/full-width forms.
     */
    private static boolean isChinese(char c) {
        Character.UnicodeBlock ub = Character.UnicodeBlock.of(c);
        return ub == Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS
                || ub == Character.UnicodeBlock.CJK_COMPATIBILITY_IDEOGRAPHS
                || ub == Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS_EXTENSION_A
                || ub == Character.UnicodeBlock.GENERAL_PUNCTUATION
                || ub == Character.UnicodeBlock.CJK_SYMBOLS_AND_PUNCTUATION
                || ub == Character.UnicodeBlock.HALFWIDTH_AND_FULLWIDTH_FORMS;
    }

    /**
     * Returns everything before the third {@code /} of {@code url}
     * (for {@code http://host/path} that is {@code http://host}).
     *
     * @param url url, may be null
     * @return the prefix before the third slash, or {@code url} itself (including
     *         {@code null}) when it contains fewer than three slashes
     */
    public static String getHost(String url) {
        if (url == null) {
            return null;
        }
        int slash = -1;
        for (int seen = 0; seen < 3; seen++) {
            slash = url.indexOf('/', slash + 1);
            if (slash < 0) {
                return url;
            }
        }
        return url.substring(0, slash);
    }

    /**
     * Removes every {@code scheme://} occurrence from {@code url}, not only a leading one.
     *
     * @param url url, must not be null
     * @return the url without scheme prefixes
     */
    public static String removeProtocol(String url) {
        return PROTOCOL_PATTERN.matcher(url).replaceAll("");
    }

    /**
     * Extracts the domain: the scheme is removed, then everything from the first
     * {@code /} found at index 1 or later is dropped, then the port is removed.
     *
     * @param url url, must not be null
     * @return the domain without scheme, path and port
     */
    public static String getDomain(String url) {
        String domain = removeProtocol(url);
        int pathStart = domain.indexOf('/', 1);
        if (pathStart > 0) {
            domain = domain.substring(0, pathStart);
        }
        return removePort(domain);
    }

    /**
     * Cuts {@code domain} at its first {@code :}.
     *
     * @param domain host with optional port, must not be null
     * @return the part before the first colon, or {@code domain} when it has none
     */
    public static String removePort(String domain) {
        int portIndex = domain.indexOf(':');
        return portIndex == -1 ? domain : domain.substring(0, portIndex);
    }

    /**
     * Extracts the charset name from a Content-Type style value such as
     * {@code text/html; charset=UTF-8}.
     *
     * @param contentType header value, may be null
     * @return the charset name as written in the header, or {@code null} when the value
     *         is null, declares no charset, or declares one that is illegal or not
     *         supported by this JVM
     */
    public static String getCharset(String contentType) {
        if (contentType == null) {
            return null;
        }
        Matcher matcher = CHARSET_PATTERN.matcher(contentType);
        if (!matcher.find()) {
            return null;
        }
        String charset = matcher.group(1);
        try {
            return Charset.isSupported(charset) ? charset : null;
        } catch (IllegalArgumentException e) {
            // Charset.isSupported rejects empty or syntactically illegal names by throwing
            return null;
        }
    }

}
